Analyzing BANKSY Embedding for Niche Identities

Author

Peter Youyun Zheng

Published

August 16, 2024

Introduction

In this doc, we will be looking at the banksy clusters and try to interpret them based on their identity, and other characteristics.

Load the data

The objects are here:

RDS object with Leiden Clusters: /xchip/beroukhimlab/coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_20240719_092550.rds

-   `/xchip/beroukhimlab/coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_embeddings/total_spatial_banksy_clusters_subset_20240719_092550.rds`

Cell type clusters annotation: /xchip/beroukhimlab/coja/Spatial_PLGG/data/Xenium/banksy/banksy_cell_cluster_annotation.tsv

Code
# load the banksy embeddings
banksy_embeddings = readRDS(ifelse(
    workdir == "/xchip/beroukhimlab/",
    paste0(workdir,'coja/Spatial_PLGG/data/Xenium/Xenium_Objects/total_spatial_banksy_clusters_subset_20240719_092550.rds'),
    '/Users/youyun/Documents/HMS/PhD/beroukhimlab/plgg/data/total_spatial_banksy_clusters_subset_20240719_092550.rds'
))
colData(banksy_embeddings)$sample_id = factor(colData(banksy_embeddings)$sample_id)

# Load the cell type clusters annotation
prelim_annotation_res1 = fread(paste0(
    workdir,'coja/Spatial_PLGG/data/Xenium/banksy/banksy_cell_cluster_annotation.tsv'
))

# specify the cluster names 
# this is mainly to keep track of which resolution we are using
res = 1
cell_clust_name = gsub('.0$','',sprintf('clust_Harmony_BANKSY_lam0.2_k50_res%.1f',res))
niche_clust_name = gsub('lam0.2','lam0.8',cell_clust_name)

# # filter the banksy object to only include the cells with annotated cell types
# annotated_cell_clusters = sort(unique(prelim_annotation_res1$cluster))
# banksy_embeddings = banksy_embeddings[,colData(banksy_embeddings)[,cell_clust_name] %in% annotated_cell_clusters]

Setting up Neighborhood Analysis

Here we will look at the niches in the data and annotate them using the cell types of self and neighbors.

Code
k_geom = 15
locs = spatialCoords(banksy_embeddings)
knn = dbscan::kNN(x = locs, k = k_geom)
knnDT <- data.table(
    from = rep(seq_len(nrow(knn$id)), k_geom),
    from_id = rep(rownames(knn$id), k_geom),
    to = as.vector(knn$id),
    to_id = rownames(knn$id)[as.vector(knn$id)],
    distance = as.vector(knn$dist)
)
cell_cell_type_dt = data.table(
    cell_id = rownames(colData(banksy_embeddings)),
    cell_type = as.numeric(colData(banksy_embeddings)[,cell_clust_name]),
    niche_type = as.numeric(colData(banksy_embeddings)[,niche_clust_name]),
    sample_id = colData(banksy_embeddings)$sample_id,
    histology = colData(banksy_embeddings)$histology
)

cell_type_cols = setdiff(colnames(prelim_annotation_res1),'cluster')
niche_self_neighbor_dt = knnDT[,c('from','to') := NULL][
    cell_cell_type_dt[,.(cell_id,cell_type)], on = c('to_id' = 'cell_id'),
    cell_type_to := cell_type
][cell_cell_type_dt, on = c('from_id' = 'cell_id')][
    ,.(
    cell_id = from_id, cell_type, niche_type, sample_id, 
    histology, neighbor_cell_type = cell_type_to
)][
    prelim_annotation_res1, on = c('cell_type' = 'cluster')
][
    prelim_annotation_res1, on = c('neighbor_cell_type' = 'cluster'),
    paste0(cell_type_cols,' Neighbors') := mget(paste0('i.',cell_type_cols))
][order(cell_id)]
niche_self_dt = unique(niche_self_neighbor_dt[
    ,mget(c('cell_id','cell_type','niche_type','sample_id','histology',cell_type_cols))
])

Build the color palette for all niches and cell types downstream

Code
cluster_classes = sort(unique(niche_self_neighbor_dt$niche_type))
niche_color_manual = structure(
    pals::polychrome()[c(1:length(cluster_classes))],
    names = cluster_classes
)

cell_type_to_test = 'Coarse Cell Type'
cell_type_to_test = 'Fine Cell Type'
cell_type_to_test = 'Fine Cell Type UMAP'

cell_type_class = sort(unique(prelim_annotation_res1[[cell_type_to_test]]))
cell_type_color_manual = structure(
    pals::alphabet()[c(1:length(cell_type_class))],
    names = cell_type_class
)

Looking at the niches in space

Code
graph_dt = melt(
    as.data.table(cbind(
        spatialCoords(banksy_embeddings),
        colData(banksy_embeddings)[c(
            'sample_id','cell_id', 'histology',
            grep('Harmony.*.res1$',clusterNames(banksy_embeddings), value = TRUE)
        )]
    )),id.vars = c('sample_id','cell_id','histology','sdimx','sdimy'),
    variable.name = 'Cluster Type',value.name = 'Cluster ID'
)

g = ggplot(
    graph_dt[`Cluster Type` == 'clust_Harmony_BANKSY_lam0.8_k50_res1'], 
    aes(x=sdimy, y=sdimx, color=`Cluster ID`)
) +
    facet_wrap(histology ~ sample_id, nrow = 2, scale = 'free_x') +
    geom_point(size = 0.1, alpha = 0.5) + 
    scale_color_manual(values = niche_color_manual) +
    theme_classic() + 
    theme(
        legend.position = "bottom",
        axis.text.x=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        text = element_text(size = 15),
        strip.text = element_text(size = 10),
        strip.background = element_blank()
    ) + guides(
        color = guide_legend(nrow = 3, byrow = TRUE)
    ) +
    coord_flip()
g

Code
# p = ggplotly(g)
# saveWidget(p, paste0(workdir,'/youyun/plgg/code/niche/niche_identity_spatial.html'), selfcontained = TRUE)

What are the niches?

What is the distribution of niches?

Across samples and histologies

Code
ggplot(niche_self_dt) + 
    geom_bar(aes(x = sample_id, fill = as.factor(niche_type)), position = 'fill') + 
    facet_wrap(~histology, scale = 'free_x') +
    theme_minimal() + theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 15), legend.position = 'bottom'
    ) + scale_fill_manual(values = niche_color_manual) +
    labs(
        title = 'Distribution of Niches Across Samples',
        x = 'Sample ID',
        y = 'Proportion of Cells'
    ) + guides(
        fill = guide_legend(title = 'Niche Type', nrow = 2)
    )
ggplot(niche_self_dt) + 
    geom_bar(aes(x = sample_id, fill = as.factor(niche_type)), position = 'stack') + 
    facet_wrap(~histology, scale = 'free') +
    theme_minimal() + theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 15), legend.position = 'bottom'
    ) + scale_fill_manual(values = niche_color_manual) +
    labs(
        title = 'Number of Niches Across Samples',
        x = 'Sample ID',
        y = 'Proportion of Cells'
    ) + guides(
        fill = guide_legend(title = 'Niche Type', nrow = 2)
    )

Are there niches exclusive or prevalent across samples and histologies?

Some statistical testing

Code
histology_niche_enrichment = pairwise_categorical_enrichment_test(
    niche_self_dt$niche_type, niche_self_dt$histology
)
setnames(
    histology_niche_enrichment, 1:2,
    c('niche_type', 'histology')
)
ggplot(histology_niche_enrichment) +
    geom_point(aes(
        x = as.numeric(niche_type), y = histology, size = N_var1var2, 
        # color by odds_ratios column capped at percentile 95
        fill = pmin(
            quantile(log(odds_ratios), 0.95, na.rm = TRUE), 
            log(odds_ratios), na.rm = TRUE
        ), color = ifelse(
            q_value < 0.05 & odds_ratios > 1, 
            'significant','not significant'
        )
    ), shape = 21, stroke = 1) +
    theme(
        text = element_text(size = 15), legend.position = 'right'
    ) + scale_fill_viridis_c() +
    scale_x_continuous(breaks = as.numeric(unique(histology_niche_enrichment$niche_type))) +
    scale_size_area(max_size = 10) +
    scale_color_manual(values = c('significant' = 'red','not significant' = 'black')) +
    labs(
        title = 'Enrichment of Niches Across Histologies',
        x = 'Niche Type',
        y = 'Histology',
        size = 'Number of Niches',
        fill = 'log OR',
        color = 'Significance'
    ) + guides(
        fill = guide_legend(title = 'log OR'),
        color = guide_legend(title = 'Significance')
    )

Is each niche exclusive to one self-cell-type?

Code
ggplot(niche_self_dt) + 
    geom_bar(aes(x = niche_type, fill = as.factor(get(cell_type_to_test))), position = 'fill') + 
    theme_minimal() + theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 15), legend.position = 'bottom'
    ) + scale_fill_manual(values = cell_type_color_manual) +
    labs(
        title = 'Distribution of Self Cell Type By Niche',
        x = 'Niche Type',
        y = 'Proportion of Cell Type'
    ) + guides(
        fill = guide_legend(title = cell_type_to_test, nrow = 3)
    )

Code
ggplot(
    unique(niche_self_neighbor_dt[
        ,mget(c('cell_id','cell_type','niche_type','sample_id','histology',cell_type_cols))
    ])
) + geom_bar(aes(x = niche_type, fill = as.factor(get(cell_type_to_test))), position = 'stack') + 
    theme_minimal() + theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 15), legend.position = 'bottom'
    ) + scale_fill_manual(values = cell_type_color_manual) +
    labs(
        title = 'Distribution of Self Cell Type By Niche',
        x = 'Niche Type',
        y = 'Proportion of Cell Type'
    ) + guides(
        fill = guide_legend(title = cell_type_to_test, nrow = 3)
    )

Some statistical testing

Code
self_cell_type_niche_enrichment = pairwise_categorical_enrichment_test(
    niche_self_dt$niche_type, niche_self_dt[[cell_type_to_test]]
)
setnames(
    self_cell_type_niche_enrichment, 1:2,
    c('niche_type', 'cell_type')
)

ggplot(self_cell_type_niche_enrichment) +
    geom_point(aes(
        x = as.numeric(niche_type), y = cell_type, size = N_var1var2, 
        # color by odds_ratios column capped at percentile 95
        fill = pmin(
            quantile(log(odds_ratios), 0.95, na.rm = TRUE), 
            log(odds_ratios), na.rm = TRUE
        ), color = ifelse(
            q_value < 0.05 & odds_ratios > 1, 
            'significant','not significant'
        )
    ), shape = 21, stroke = 1) +
    theme(
        text = element_text(size = 15), legend.position = 'right'
    ) + scale_fill_viridis_c() +
    scale_x_continuous(breaks = as.numeric(unique(self_cell_type_niche_enrichment$niche_type))) +
    scale_size_area(max_size = 10) +
    scale_color_manual(values = c('significant' = 'red','not significant' = 'black')) +
    labs(
        title = 'Enrichment of Self Cell Type By Niche',
        x = 'Niche Type',
        y = 'Cell Type',
        size = 'Number of Niches',
        fill = 'log OR',
        color = 'Significance'
    ) + guides(
        fill = guide_legend(title = 'log OR'),
        color = guide_legend(title = 'Significance')
    )

Does each niche represent a group of cells that are isotropic or anisotropic?

Code
ggplot(
    niche_self_neighbor_dt[
        ,.(proportion = .N/15), .(
            cell_id, niche_type, 
            neighbor_cell_type = get(paste0(cell_type_to_test,' Neighbors'))
        )
    ]
) + geom_boxplot(aes(
    x = neighbor_cell_type, y = proportion, 
    fill = neighbor_cell_type
)) +
    facet_wrap(~niche_type) +
    theme_minimal() + theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 15), legend.position = 'bottom'
    ) + scale_fill_manual(values = cell_type_color_manual) +
    labs(
        title = 'Distribution of Neighbor Cell Type By Niche',
        x = 'Neighbor Cell Type',
        y = 'Proportion of Neighbor Cell Type'
    ) + guides(
        fill = guide_legend(title = cell_type_to_test, nrow = 3)
    )

Some statistical testing

Code
neighbor_cell_type_niche_enrichment = pairwise_categorical_continuous_enrichment_test(dcast(
    niche_self_neighbor_dt[
        !is.na(get(paste0(cell_type_to_test,' Neighbors'))),.(proportion = .N/15), .(
            cell_id, niche_type, 
            neighbor_cell_type = get(paste0(cell_type_to_test,' Neighbors'))
        )
    ],
    cell_id + niche_type ~ neighbor_cell_type, 
    value.var = 'proportion', fill = 0
))
ggplot(neighbor_cell_type_niche_enrichment) +
    geom_point(aes(
        x = as.numeric(niche_type), y = continuous_cols, size = N_var2,
        fill = delta, 
        color = ifelse(q_value < 0.05 & as.numeric(delta) > 0, 'significant','not significant')
    ), shape = 21, stroke = 1) + 
    theme(
        text = element_text(size = 15), legend.position = 'right'
    ) + scale_fill_viridis_c() +
    scale_x_continuous(breaks = as.numeric(unique(neighbor_cell_type_niche_enrichment$niche_type))) +
    scale_size_area(max_size = 10) +
    scale_color_manual(values = c('significant' = 'red','not significant' = 'black')) +
    labs(
        title = 'Enrichment of Neighbor Cell Type By Niche',
        x = 'Niche Type',
        y = 'Neighbor Cell Type',
        size = '% Neighboring Cells',
        fill = 'Delta',
        color = 'Significance'
    ) + guides(
        fill = guide_legend(title = 'Delta'),
        color = guide_legend(title = 'Significance')
    )

[1] "The following categories are left out because there are less than 30 observations: 18, 21, 23, 24, 25, 28"

Pulling everything together to understand the niches

Code
library(scales)
including_niches = intersect(
    intersect(
        unique(histology_niche_enrichment$niche_type),
        unique(self_cell_type_niche_enrichment$niche_type)
    ),
    unique(neighbor_cell_type_niche_enrichment$niche_type) 
)
enrichment_overview_dt = merge(
    rbind(
        self_cell_type_niche_enrichment[
            niche_type %in% including_niches,
            .(cell_type, size = N_var1var2/sum(N_var1var2), q_value, test_statistic = odds_ratios),
            .(niche_type)
        ][, data_type := 'Self Cell Type'],
        neighbor_cell_type_niche_enrichment[
            niche_type %in% including_niches,
            .(cell_type = continuous_cols, size = N_var2, q_value, test_statistic = delta),
            .(niche_type)
        ][, data_type := 'Neighbor Cell Type']
    ),histology_niche_enrichment[
        niche_type %in% including_niches
        ,.(enriched_histologies = paste0(
            histology[q_value < 0.05 & odds_ratios > 1], collapse = ',\n'
        )), .(niche_type)
    ], by = 'niche_type'
)

ggplot(enrichment_overview_dt) + geom_point(aes(
    x = data_type, y = cell_type, size = size, 
    # fill = q_value, 
    color = ifelse(
        # if it's the fishers test, we use a odds ratio > 1 as positive enrichment
        (q_value < 0.05 & data_type == 'Self Cell Type' & test_statistic > 1) |
        # if it's the continuous test, we use a delta > 0 as positive enrichment
            (q_value < 0.05 & data_type == 'Neighbor Cell Type' & test_statistic > 0), 
        'significant','not significant'
    )
), shape = 21, stroke = 1) + 
    facet_wrap(as.numeric(niche_type) ~ enriched_histologies, nrow = 3) +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 20), 
        legend.position = 'bottom',
        strip.text = element_text(size = 15),
        strip.background = element_blank(),
        panel.spacing = unit(1, "lines")
    ) + #scale_fill_viridis_c()  +
    scale_size_area(max_size = 10) +
    scale_x_discrete(
        labels = c('Self','Neighbor'),
        limits = c('Self Cell Type','Neighbor Cell Type')
    ) +
    scale_color_manual(values = c('significant' = 'red','not significant' = 'black')) +
    labs(
        title = 'Enrichment of Cell Types By Niche',
        x = 'Niche Type',
        y = 'Cell Type',
        size = 'Proportion Cell Type',
        fill = 'q-value',
        color = 'Significance'
    ) + guides(
        fill = guide_legend(title = 'q-value'),
        color = guide_legend(title = 'Significance')
    )

Code
ggplot(enrichment_overview_dt) + geom_point(aes(
    x = data_type, y = cell_type, size = size, 
    # fill = q_value, 
    color = ifelse(
        # if it's the fishers test, we use a odds ratio > 1 as positive enrichment
        (q_value < 0.05 & data_type == 'Self Cell Type' & test_statistic > 1) |
        # if it's the continuous test, we use a delta > 0 as positive enrichment
            (q_value < 0.05 & data_type == 'Neighbor Cell Type' & test_statistic > 0), 
        'significant','not significant'
    )
), shape = 21, stroke = 1) + 
    facet_wrap(enriched_histologies ~ as.numeric(niche_type), nrow = 3) +
    theme(
        axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 20), 
        legend.position = 'bottom',
        strip.text = element_text(size = 15),
        strip.background = element_blank(),
        panel.spacing = unit(1, "lines")
    ) + #scale_fill_viridis_c()  +
    scale_size_area(max_size = 10) +
    scale_x_discrete(
        labels = c('Self','Neighbor'),
        limits = c('Self Cell Type','Neighbor Cell Type')
    ) +
    scale_color_manual(values = c('significant' = 'red','not significant' = 'black')) +
    labs(
        title = 'Enrichment of Cell Types By Niche',
        x = 'Niche Type',
        y = 'Cell Type',
        size = 'Proportion Cell Type',
        fill = 'q-value',
        color = 'Significance'
    ) + guides(
        fill = guide_legend(title = 'q-value'),
        color = guide_legend(title = 'Significance')
    )